Análisis de los datos de Strava¶
In [3]:
%%capture
# Install necessary packages
!pip install fitdecode
!pip install Path
!pip install zipfile36
!pip install dateparser
In [4]:
# Use plotly's Jupyter Notebook renderer as the default for every figure
# displayed with `.show()` further down.
import plotly.io as pio

pio.renderers.default = "notebook"
Importing the necessary data
In [7]:
# Importing packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import dateparser
# Activity table of the Strava export; read twice below (raw + cleaned view)
csv_path = "activities.csv"

# Positions of relevant columns
usecols = [0, 1, 2, 3, 4, 5, 7, 8, 12, 16, 17, 18, 20, 31]

# English column names, one per selected column
names = [
    "Activity ID",
    "Activity Date",
    "Activity Name",
    "Activity Type",
    "Activity Description",
    "Elapsed Time",
    "Max Heart Rate",
    "Relative Effort",
    "Filename",
    "Moving Time",
    "Distance",
    "Max Pace",
    "Elevation Gain",
    "Average Heart Rate",
]

# Reading the raw data for reference (only its row count is reported below)
raw = pd.read_csv(csv_path)

# Reading the raw data with preprocessing: keep the selected columns, rename
# them, and use "Activity ID" as the index. Dates are parsed separately below
# because the `date_parser` argument of `read_csv` is deprecated.
df = pd.read_csv(
    csv_path,
    index_col=0,
    usecols=usecols,
    names=names,
    header=0,
)

# Parse every date string with dateparser (as the original `date_parser` hook
# did), then convert the resulting datetime objects to a datetime64 column.
# `na_action="ignore"` keeps missing dates as NaT instead of raising.
df["Activity Date"] = pd.to_datetime(
    df["Activity Date"].map(dateparser.parse, na_action="ignore")
)

print(f"{raw.shape[0]} rows in raw file")

# Drop rows (activities) with missing values in Moving Time or Distance
df = df.dropna(axis=0, subset=["Moving Time", "Distance"])
print(f"{df.shape[0]} rows remaining after cleaning")

# Creating new columns:
# Add day, week, month, quarter, year columns as strings. Each one is inserted
# right after "Activity Type", so the final left-to-right order is
# Year, Quarter, Month, Week, Day.
period_columns = {"Day": "D", "Week": "W", "Month": "M", "Quarter": "Q", "Year": "Y"}
period_position = df.columns.get_loc("Activity Type") + 1
for period_name, period_freq in period_columns.items():
    df.insert(
        period_position,
        period_name,
        df["Activity Date"].dt.to_period(period_freq).astype(str),
    )

# Convert moving time from seconds to hours (placed just before "Moving Time")
df.insert(
    df.columns.get_loc("Moving Time"),
    "Moving Time (hr)",
    df["Moving Time"] / 3600,
)

# Convert distance from meters to kilometers (placed just after "Distance")
df.insert(
    df.columns.get_loc("Distance") + 1,
    "Distance (km)",
    df["Distance"] / 1000,
)

# Calculate average speed. A moving time of zero is treated as missing so the
# result is NaN rather than +/-inf, which would distort plots and trend lines.
df.insert(
    df.columns.get_loc("Distance (km)") + 1,
    "Average Speed (km/hr)",
    df["Distance (km)"] / df["Moving Time (hr)"].replace(0, np.nan),
)

# Calculate maximum speed (placed just before "Max Pace").
# NOTE(review): the factor 3.6 presumes "Max Pace" is in m/s - confirm against the export.
df.insert(
    df.columns.get_loc("Max Pace"),
    "Max Speed (km/hr)",
    df["Max Pace"] * 3.6,
)

# Print date bounds of the data
print(f"Ranges from {df.Day.min()} to {df.Day.max()}")

# Preview the data
df.tail()
/tmp/ipykernel_3631/395550362.py:33: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.
550 rows in raw file 550 rows remaining after cleaning Ranges from 2018-09-09 to 2023-08-30
Out[7]:
| Activity Date | Activity Name | Activity Type | Year | Quarter | Month | Week | Day | Activity Description | Elapsed Time | ... | Filename | Moving Time (hr) | Moving Time | Distance | Distance (km) | Average Speed (km/hr) | Max Speed (km/hr) | Max Pace | Elevation Gain | Average Heart Rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Activity ID | |||||||||||||||||||||
| 9638976449 | 2023-08-13 09:34:26 | Bicicleta a la hora del almuerzo | Bicicleta | 2023 | 2023Q3 | 2023-08 | 2023-08-07/2023-08-13 | 2023-08-13 | NaN | 9806 | ... | activities/10334897911.fit.gz | 2.180833 | 7851.0 | 44452.949219 | 44.452949 | 20.383469 | 55.281095 | 15.355860 | 1087.0 | NaN |
| 9644330136 | 2023-08-05 16:55:14 | Carrera de noche | Carrera | 2023 | 2023Q3 | 2023-08 | 2023-07-31/2023-08-06 | 2023-08-05 | NaN | 1870 | ... | activities/10340562580.fit.gz | 0.516111 | 1858.0 | 6679.859863 | 6.679860 | 12.942678 | 26.819999 | 7.450000 | 0.0 | 185.839615 |
| 9683261634 | 2023-08-20 06:21:37 | Bicicleta por la mañana | Bicicleta | 2023 | 2023Q3 | 2023-08 | 2023-08-14/2023-08-20 | 2023-08-20 | NaN | 4957 | ... | activities/10381618854.fit.gz | 1.183333 | 4260.0 | 24721.830078 | 24.721830 | 20.891687 | 49.886248 | 13.857291 | 535.0 | NaN |
| 9750863053 | 2023-08-21 17:16:47 | Bicicleta al anochecer | Bicicleta | 2023 | 2023Q3 | 2023-08 | 2023-08-21/2023-08-27 | 2023-08-21 | NaN | 6049 | ... | activities/10453731608.fit.gz | 1.405556 | 5060.0 | 25067.339844 | 25.067340 | 17.834471 | 36.490079 | 10.136133 | 62.0 | NaN |
| 9750871220 | 2023-08-30 15:58:50 | Bicicleta por la tarde | Bicicleta | 2023 | 2023Q3 | 2023-08 | 2023-08-28/2023-09-03 | 2023-08-30 | NaN | 5646 | ... | activities/10453739983.fit.gz | 1.412778 | 5086.0 | 29676.060547 | 29.676061 | 21.005470 | 54.762891 | 15.211914 | 667.0 | NaN |
5 rows × 22 columns
Calculating the cumulative sums
In [8]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_km = "Month"

# Activities whose total distance is at or below this many km are left out of
# the cumulative table (e.g., weight training). Tune it to your data.
min_total_km = 1

# Group by time_unit_km and activity type
df_km = df.groupby(by=[time_unit_km, "Activity Type"], as_index=False).agg(
    count=("Distance (km)", "count"),
    total_distance_km=("Distance (km)", "sum"),
    avg_distance_km=("Distance (km)", "mean"),
)

# For each activity and time period, make sure there exists a row.
# This ensures there is a point on the plot for each combination.
acts = df_km["Activity Type"].unique()
times = df_km[time_unit_km].unique()

# Set of (activity, period) pairs already present, so each membership check is
# a constant-time lookup instead of a scan over the frame.
observed_pairs = set(zip(df_km["Activity Type"], df_km[time_unit_km]))
new_rows = [
    {
        time_unit_km: t,
        "Activity Type": a,
        "count": 0,
        "total_distance_km": 0,
        "avg_distance_km": 0,
    }
    for a in acts
    for t in times
    if (a, t) not in observed_pairs
]

# Append the zero-filled rows (skipped when every combination already exists)
if new_rows:
    df_km = pd.concat([df_km, pd.DataFrame(new_rows)], ignore_index=True)

# Total distance per activity; only the numeric distance column is summed so
# the string time-period column is not aggregated.
kms = df_km.groupby("Activity Type", as_index=False)["total_distance_km"].sum()
kms = kms[kms["total_distance_km"] > min_total_km]

# For each retained activity, order the periods chronologically and compute
# the running total of kms. Built as a new frame (no in-place edits on a
# slice of df_km), so df_km itself is left untouched.
csum = df_km.loc[df_km["Activity Type"].isin(kms["Activity Type"])].sort_values(
    by=["Activity Type", time_unit_km]
)
csum = csum.assign(
    csum_km=csum.groupby("Activity Type")["total_distance_km"].cumsum()
)
csum
Out[8]:
| Month | Activity Type | count | total_distance_km | avg_distance_km | csum_km | |
|---|---|---|---|---|---|---|
| 0 | 2018-09 | Bicicleta | 6 | 159.075098 | 26.512516 | 159.075098 |
| 100 | 2018-10 | Bicicleta | 0 | 0.000000 | 0.000000 | 159.075098 |
| 2 | 2018-11 | Bicicleta | 1 | 25.858400 | 25.858400 | 184.933499 |
| 4 | 2018-12 | Bicicleta | 2 | 25.319801 | 12.659900 | 210.253299 |
| 5 | 2019-01 | Bicicleta | 1 | 44.595602 | 44.595602 | 254.848901 |
| ... | ... | ... | ... | ... | ... | ... |
| 91 | 2023-04 | Carrera | 14 | 126.872651 | 9.062332 | 2063.925490 |
| 93 | 2023-05 | Carrera | 14 | 147.999292 | 10.571378 | 2211.924782 |
| 95 | 2023-06 | Carrera | 16 | 137.116259 | 8.569766 | 2349.041041 |
| 97 | 2023-07 | Carrera | 9 | 91.196648 | 10.132961 | 2440.237690 |
| 99 | 2023-08 | Carrera | 2 | 11.947680 | 5.973840 | 2452.185369 |
165 rows × 6 columns
Análisis general¶
Gráfica de los kilómetros recorridos por tipo de actividad
In [9]:
# For the plot title
total_km = round(df_km["total_distance_km"].sum())

# Plot a stacked area plot of the cumulative kms per activity type
fig_km = px.area(
    csum,
    x=time_unit_km,
    y="csum_km",
    color="Activity Type",
    title=f"My {total_km} Kilometers on Strava!",  # Set title text
    hover_data={  # Define variables for hover text
        "csum_km": ":.1f",
        # ":.0f" shows the count as a whole number; a bare ":f" falls back to
        # six decimals (e.g. "6.000000").
        "count": ":.0f",
        "total_distance_km": ":.1f",
        "avg_distance_km": ":.1f",
    },
    labels=dict(  # Define labels for variables
        count="Number of activities",
        avg_distance_km="Average kms per activity",
        total_distance_km="Total kms covered",
        csum_km="Cumulative kms covered",
    ),
    color_discrete_sequence=px.colors.qualitative.Bold,  # Define color swatch
)

# Set max allowed number of ticks on x and y axes
fig_km.update_xaxes(nticks=20)
fig_km.update_yaxes(nticks=15)

# Adjust the size and layout
fig_km.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)
fig_km.show()
- Las salidas en bicicleta acumulan la mayor parte de los kilómetros recorridos
Horas dedicadas a cada tipo de actividad
In [10]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_bar = "Month"

# Group by time_unit_bar and activity type
df_hr = df.groupby(by=[time_unit_bar, "Activity Type"], as_index=False).agg(
    count=("Moving Time (hr)", "count"),
    total_hr_spent=("Moving Time (hr)", "sum"),
    avg_hr_spent=("Moving Time (hr)", "mean"),
)

# For the plot title
total_hr = round(df_hr["total_hr_spent"].sum())

# Plot a stacked bar plot of the hours spent per period and activity type
fig_hr = px.bar(
    df_hr,
    x=time_unit_bar,
    y="total_hr_spent",
    color="Activity Type",
    title=f"My {total_hr} hours on Strava!",  # Set title text
    hover_data={  # Define variables for hover text
        # ":.0f" shows the count as a whole number; a bare ":f" falls back to
        # six decimals (e.g. "6.000000").
        "count": ":.0f",
        "total_hr_spent": ":.1f",
        "avg_hr_spent": ":.1f",
    },
    labels=dict(  # Define labels for variables
        total_hr_spent="Total hrs spent",
        count="Number of activities",
        avg_hr_spent="Average hrs spent per activity",
    ),
    color_discrete_sequence=px.colors.qualitative.Bold,  # Define color swatch
)

# Set max allowed number of ticks on x and y axes
fig_hr.update_xaxes(nticks=20)
fig_hr.update_yaxes(nticks=15)

# Adjust the size and layout
fig_hr.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    legend=dict(  # Move the legend to the bottom
        orientation="h",
        yanchor="bottom",
        y=-0.6,
        xanchor="right",
        x=1,
        title=None,  # Remove legend title
    ),
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)
fig_hr.show()
- Se observa una evolución histórica desde más horas de bicicleta a más horas de correr
- La estacionalidad muestra que en los meses de verano hay más horas dedicadas a la bicicleta
Análisis de las actividades de carrera¶
In [14]:
# Activity analysed in this section (running). It is named explicitly rather
# than picked by position from a distance ranking, so the cell keeps plotting
# runs even if other activity types are added to the export.
activity = "Carrera"

# Independent copy of the rows for this activity, so adding a column below
# cannot touch (or warn about) the main frame.
speed = df.loc[df["Activity Type"] == activity].copy()

# Generating four equal-width bins based on the Distance column. The cut is
# done once: `cats` holds the bin of every row, `bins` the bin edges.
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)

# Human-readable label per bin, e.g. "5-10km carreras"
bin_labels = [
    f"{low}-{high}km {activity.lower()}s" for low, high in zip(bins[:-1], bins[1:])
]

# Same bin assignment as `cats`, with the interval categories renamed
speed.insert(0, "distance_bin", cats.cat.rename_categories(bin_labels))

# Create a scatter plot with one subplot per distance bin
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Shorten subplot titles from "distance_bin=<label>" to just "<label>"
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Hide the x-axis titles and rotate the xtick labels on every subplot
fig_s.update_xaxes(title_text="", tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)
fig_s.show()
In [24]:
# Activity analysed in this section (running). It is named explicitly rather
# than picked by position from a distance ranking, so the cell keeps plotting
# runs even if other activity types are added to the export.
activity = "Carrera"

# Rows for this activity. No distance bins are built here: this figure is a
# single panel, so the binning used by the faceted plot is not needed.
speed = df.loc[df["Activity Type"] == activity]

# Create a single scatter plot: average speed against distance, colored by
# elevation gain
fig_s = px.scatter(
    speed,
    x="Distance (km)",
    y="Average Speed (km/hr)",
    color="Elevation Gain",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Elevation Gain", "Activity Date"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{x:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[0]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="Elevation gain",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)
fig_s.show()
In [30]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_run = "Week"

# Only runs strictly after this moment are included
run_start = pd.Timestamp("2022-09-01 00:00:00")

# Running activities in the selected window
df_carrera = df[(df["Activity Type"] == "Carrera") & (df["Activity Date"] > run_start)]

# Distance covered per time unit. The columns are named after what they hold
# (kilometers), and the result gets its own names so the hours-per-activity
# frame and figure built earlier are not overwritten.
df_run_km = df_carrera.groupby(by=[time_unit_run], as_index=False).agg(
    count=("Distance (km)", "count"),
    total_distance_km=("Distance (km)", "sum"),
    avg_distance_km=("Distance (km)", "mean"),
)

# For the plot title
total_run_km = round(df_run_km["total_distance_km"].sum())

# Plot one bar per time unit, colored by the average distance per run
fig_run_km = px.bar(
    df_run_km,
    x=time_unit_run,
    y="total_distance_km",
    color="avg_distance_km",
    title=f"My {total_run_km} running kilometers since {run_start:%Y-%m-%d}!",  # Set title text
    hover_data={  # Define variables for hover text
        "count": ":.0f",  # whole number; a bare ":f" would print six decimals
        "total_distance_km": ":.1f",
        "avg_distance_km": ":.1f",
    },
    labels=dict(  # Define labels for variables
        total_distance_km="Total kms covered",
        count="Number of activities",
        avg_distance_km="Average kms per activity",
    ),
)

# Set max allowed number of ticks on x and y axes
fig_run_km.update_xaxes(nticks=20)
fig_run_km.update_yaxes(nticks=15)

# Adjust the size and layout. No legend settings: the color is continuous, so
# the figure shows a color bar instead of a legend.
fig_run_km.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)
fig_run_km.show()
Análisis de las actividades de bicicleta¶
In [31]:
# Find the activity with the most kms. Only the numeric distance column is
# summed, so the string time-period column of df_km is not aggregated.
activity = df_km.groupby("Activity Type")["total_distance_km"].sum().idxmax()

# Independent copy of the rows for this activity, so adding a column below
# cannot touch (or warn about) the main frame.
speed = df.loc[df["Activity Type"] == activity].copy()

# Generating four equal-width bins based on the Distance column. The cut is
# done once: `cats` holds the bin of every row, `bins` the bin edges.
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)

# Human-readable label per bin, e.g. "20-40km bicicletas"
bin_labels = [
    f"{low}-{high}km {activity.lower()}s" for low, high in zip(bins[:-1], bins[1:])
]

# Same bin assignment as `cats`, with the interval categories renamed
speed.insert(0, "distance_bin", cats.cat.rename_categories(bin_labels))

# Create a scatter plot with one subplot per distance bin
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Shorten subplot titles from "distance_bin=<label>" to just "<label>"
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Hide the x-axis titles and rotate the xtick labels on every subplot
fig_s.update_xaxes(title_text="", tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)
fig_s.show()
In [35]:
# Find the activity with the most kms. Only the numeric distance column is
# summed, so the string time-period column of df_km is not aggregated.
activity = df_km.groupby("Activity Type")["total_distance_km"].sum().idxmax()

# Rows for this activity. No distance bins are built here: this figure is a
# single panel, so the binning used by the faceted plot is not needed.
speed = df.loc[df["Activity Type"] == activity]

# Create a single scatter plot: average speed against elevation gain, colored
# by distance
fig_s = px.scatter(
    speed,
    x="Elevation Gain",
    y="Average Speed (km/hr)",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Distance (km)", "Activity Date"],  # Variables for the hover text
)

# Customize the hover text. The x-axis is the elevation gain, so distance is
# taken from the custom data instead of from x.
fig_s.update_traces(
    hovertemplate="Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[0]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{x:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Other options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Make the color bar smaller; its title matches the colored variable (distance)
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)
fig_s.show()
In [ ]: